# NOTE(review): removed `rm(list = ls())` -- it wipes the caller's global
# environment without detaching packages or resetting options, so it does
# not give a clean session and is hostile when this script is source()'d.
# Restart R for a genuinely fresh state.

library("rstan")     # Stan interface: propensity-score model
library("bayesplot") # MCMC diagnostics and posterior-predictive plots
library("ggplot2")
library("BART")      # wbart(): BART outcome models

bayesplot_theme_set(theme_default(base_size = 24, base_family = "sans"))

prepare data

# edited to add "escs" and "wealth" to the data
data_full <- readRDS("./data/cleaned/cleaned_data.Rds")
# only consider AUS - for better balance between treatment/control groups
# (treatment = private school, control = public school; counts below)
data_AUS <- data_full[data_full$country == 'AUS', ]
table(data_AUS$public_private)
## 
## private  public 
##    5210    6596
head(data_AUS)
## # A tibble: 6 × 14
##   country school_id student_id gender computer internet  math  read science
##   <fct>   <fct>     <fct>      <fct>  <fct>    <fct>    <dbl> <dbl>   <dbl>
## 1 AUS     3600002   3600210    female yes      yes       541.  611.    530.
## 2 AUS     3600002   3602562    female yes      yes       431.  469.    454.
## 3 AUS     3600002   3602765    male   yes      yes       617.  642.    659.
## 4 AUS     3600002   3603797    female yes      yes       359.  449.    401.
## 5 AUS     3600002   3606990    female yes      yes       475.  440.    443.
## 6 AUS     3600002   3608618    female yes      yes       436.  498.    420.
## # ℹ 5 more variables: stu_wgt <dbl>, wealth <dbl>, escs <dbl>,
## #   public_private <fct>, family_class <chr>
# Keep the covariates, the three outcome scores, the treatment indicator
# (public_private) and the student survey weight (stu_wgt).
data <- data_AUS[, c("gender", "computer", "internet", "math", "read", "science", "public_private", "family_class", "wealth", "escs", "stu_wgt")]

# one-hot encoding of binary variables
# (1 = female / yes / private school / middle class, 0 otherwise)
data$gender <- as.numeric(data$gender == 'female')
data$computer <- as.numeric(data$computer == "yes")
data$internet <- as.numeric(data$internet == "yes")
data$public_private <- as.numeric(data$public_private == "private")
data$family_class <- as.numeric((data$family_class == "middle class"))

# combine "computer" and "internet" into "tech_access"
# (1 only when the student has both a computer and internet access)
data$tech_access <- data$computer * data$internet

# log transformation of the scores - will transform back when making predictions
data$math_log <- log(data$math)
data$read_log <- log(data$read)
data$science_log <- log(data$science)

head(data)
## # A tibble: 6 × 15
##   gender computer internet  math  read science public_private family_class
##    <dbl>    <dbl>    <dbl> <dbl> <dbl>   <dbl>          <dbl>        <dbl>
## 1      1        1        1  541.  611.    530.              1            1
## 2      1        1        1  431.  469.    454.              1            1
## 3      0        1        1  617.  642.    659.              1            1
## 4      1        1        1  359.  449.    401.              1            1
## 5      1        1        1  475.  440.    443.              1            1
## 6      1        1        1  436.  498.    420.              1            1
## # ℹ 7 more variables: wealth <dbl>, escs <dbl>, stu_wgt <dbl>,
## #   tech_access <dbl>, math_log <dbl>, read_log <dbl>, science_log <dbl>
# Covariate matrix for the propensity-score model. Select columns by NAME
# rather than position (the original `c(1, 8:10, 12)` silently breaks if
# the column order of `data` ever changes); same five covariates as before.
X <- data[, c("gender", "family_class", "wealth", "escs", "tech_access")]
X[, "Offset"] <- 1  # explicit intercept column for the Stan design matrix
# NOTE(review): `t` shadows base::t(); later calls like t(t(m) - v) still
# resolve to base::t via function-position lookup, but a more distinctive
# name (e.g. `treat`) would be safer.
t <- data$public_private

treat_ind <- data$public_private == 1
control_ind <- data$public_private == 0

X_treat <- X[treat_ind, ]
X_control <- X[control_ind, ]

# Sanity check: covariate matrices for each arm ("Offset" is the explicit
# intercept column appended above).
head(X_treat)
## # A tibble: 6 × 6
##   gender family_class wealth   escs tech_access Offset
##    <dbl>        <dbl>  <dbl>  <dbl>       <dbl>  <dbl>
## 1      1            1  0.852  0.101           1      1
## 2      1            1  0.664 -0.793           1      1
## 3      0            1 -0.267  0.166           1      1
## 4      1            1  1.31   0.726           1      1
## 5      1            1  0.901 -0.170           1      1
## 6      1            1  1.10   1.21            1      1
head(X_control)
## # A tibble: 6 × 6
##   gender family_class wealth   escs tech_access Offset
##    <dbl>        <dbl>  <dbl>  <dbl>       <dbl>  <dbl>
## 1      1            0  0.168 -0.170           1      1
## 2      0            0 -0.887 -0.337           1      1
## 3      0            1  0.536 -1.13            1      1
## 4      1            0  0.541 -0.898           1      1
## 5      0            1  0.146  0.769           1      1
## 6      0            1 -1.25  -1.41            0      1
element_prod <- function(input_matrix, vector){
  # Multiply every row of `input_matrix` elementwise by `vector`, i.e.
  # scale column j of the matrix by vector[j]. Used below to mask
  # treated/control columns of (draws x observations) posterior matrices.
  #
  # Args:
  #   input_matrix: numeric matrix (or data frame coercible to one).
  #   vector: numeric vector with one entry per column of input_matrix.
  # Returns: numeric matrix with the same dimensions as input_matrix.
  m <- as.matrix(input_matrix)
  # Fail loudly on a length mismatch instead of silently recycling.
  stopifnot(length(vector) == ncol(m))
  # Vectorized replacement for the original per-row loop.
  sweep(m, 2, vector, `*`)
}

initialize models

# # outcome model 1 - bayesian linear regression
# # (kept for reference; the BART models further down are used instead)
# outcome_normal_code = "
# data {
# int<lower=0> N;      // number of data items
# int<lower=0> N_test;   // number of test data items
# int<lower=0> K;       // number of predictors
# real<lower=0> pr_sd; // std dev of the prior
# matrix[N, K] x;       // predictor matrix
# real y[N];          // output vector
# matrix[N_test, K] x_test;   // predictor matrix
# }
# 
# parameters {
#   vector[K] beta;       // coefficients for predictors
#   real<lower=0> sigma;  // error scale
# }
# 
# transformed parameters {
#   vector[N] mu = x * beta;
# }
# 
# model {
#   beta ~ normal(0,pr_sd);  // Note: beta is k-dim
#   sigma ~ inv_gamma(0.001,0.001);  // Close to a flat prior
#   y ~ normal(mu,sigma);  // likelihood
# }
# 
# generated quantities {
#   real y_rep[N_test];
#   y_rep = normal_rng(x_test * beta, sigma);
# } "
# 
# outcome_normal = stan_model(model_code=outcome_normal_code)
# propensity score model: Bayesian logistic regression of treatment on the
# covariate matrix x, with independent N(0, pr_sd) priors on the
# coefficients; y_rep is drawn for posterior predictive checks.
# NOTE(review): `int y[N]` is the old Stan array syntax; newer stanc3
# releases require `array[N] int y` -- confirm against the installed
# rstan version before re-running.
ps_logis_code = "
data {
  int<lower=0> N;      // number of data items
  int<lower=0> K;      // number of predictors
  real<lower=0> pr_sd; // std dev of the prior
  matrix[N, K] x;      // predictor matrix
  int<lower=0, upper=1> y[N];      // Binary outcome variable
}

parameters {
  vector[K] beta;                  // Regression coefficients
}

transformed parameters {
  vector[N] mu = x * beta;
}

model {
  // Prior for beta
  beta ~ normal(0, pr_sd);           
  
  // Likelihood
  y ~ bernoulli_logit(mu);   // Logistic regression likelihood
}

generated quantities {
  int<lower=0, upper=1> y_rep[N];
  y_rep = bernoulli_logit_rng(mu);
} "

ps_logis = stan_model(model_code=ps_logis_code)
# NOTE(review): the recorded toolchain test-compile below failed
# ('cmath' not found) on this machine; the sampling output further down
# shows the model itself still compiled and ran -- presumably a local
# Xcode/compiler configuration issue, not a model error. Verify toolchain.
## Trying to compile a simple C file
## Running /Library/Frameworks/R.framework/Resources/bin/R CMD SHLIB foo.c
## clang -mmacosx-version-min=10.13 -I"/Library/Frameworks/R.framework/Resources/include" -DNDEBUG   -I"/Library/Frameworks/R.framework/Versions/4.2/Resources/library/Rcpp/include/"  -I"/Library/Frameworks/R.framework/Versions/4.2/Resources/library/RcppEigen/include/"  -I"/Library/Frameworks/R.framework/Versions/4.2/Resources/library/RcppEigen/include/unsupported"  -I"/Library/Frameworks/R.framework/Versions/4.2/Resources/library/BH/include" -I"/Library/Frameworks/R.framework/Versions/4.2/Resources/library/StanHeaders/include/src/"  -I"/Library/Frameworks/R.framework/Versions/4.2/Resources/library/StanHeaders/include/"  -I"/Library/Frameworks/R.framework/Versions/4.2/Resources/library/RcppParallel/include/"  -I"/Library/Frameworks/R.framework/Versions/4.2/Resources/library/rstan/include" -DEIGEN_NO_DEBUG  -DBOOST_DISABLE_ASSERTS  -DBOOST_PENDING_INTEGER_LOG2_HPP  -DSTAN_THREADS  -DUSE_STANC3 -DSTRICT_R_HEADERS  -DBOOST_PHOENIX_NO_VARIADIC_EXPRESSION  -D_HAS_AUTO_PTR_ETC=0  -include '/Library/Frameworks/R.framework/Versions/4.2/Resources/library/StanHeaders/include/stan/math/prim/fun/Eigen.hpp'  -D_REENTRANT -DRCPP_PARALLEL_USE_TBB=1   -I/usr/local/include   -fPIC  -Wall -g -O2  -c foo.c -o foo.o
## In file included from <built-in>:1:
## In file included from /Library/Frameworks/R.framework/Versions/4.2/Resources/library/StanHeaders/include/stan/math/prim/fun/Eigen.hpp:22:
## In file included from /Library/Frameworks/R.framework/Versions/4.2/Resources/library/RcppEigen/include/Eigen/Dense:1:
## In file included from /Library/Frameworks/R.framework/Versions/4.2/Resources/library/RcppEigen/include/Eigen/Core:19:
## /Library/Frameworks/R.framework/Versions/4.2/Resources/library/RcppEigen/include/Eigen/src/Core/util/Macros.h:679:10: fatal error: 'cmath' file not found
## #include <cmath>
##          ^~~~~~~
## 1 error generated.
## make: *** [foo.o] Error 1

causal estimate

dr_estimate <- function(y, t, y0_pred, y1_pred, ps){
  # Per-draw, per-observation doubly-robust (AIPW) causal contrasts:
  #   tau = m1 - m0 + t*(y - m1)/e - (1 - t)*(y - m0)/(1 - e)
  #
  # Args:
  #   y:       observed outcome vector, length N.
  #   t:       binary treatment vector, length N (1 = treated).
  #   y0_pred: draws x N matrix of control-arm outcome predictions (m0).
  #   y1_pred: draws x N matrix of treated-arm outcome predictions (m1).
  #   ps:      draws x N matrix of propensity scores e = P(t = 1 | x).
  # Returns: draws x N matrix of AIPW contrasts.
  #
  # Fixes vs the original:
  #   1. `y - y_pred` recycled the length-N vector `y` column-major
  #      against a draws x N matrix, misaligning observations; `y` (and
  #      `t`) are now expanded row-wise so each column uses the right
  #      observation.
  #   2. The residual correction entered with a "+" for both arms; in
  #      the AIPW estimator the control-arm residual is subtracted.
  n_draws <- nrow(y0_pred)
  n_obs <- ncol(y0_pred)
  t_mat <- matrix(t, n_draws, n_obs, byrow = TRUE)
  y_mat <- matrix(y, n_draws, n_obs, byrow = TRUE)
  # Prediction and propensity for the arm actually observed.
  y_obs_pred <- y0_pred * (1 - t_mat) + y1_pred * t_mat
  ps_obs <- (1 - ps) * (1 - t_mat) + ps * t_mat
  # +1 for treated units, -1 for controls.
  resid_sign <- 2 * t_mat - 1
  (y1_pred - y0_pred) + resid_sign * (y_mat - y_obs_pred) / ps_obs
}

propensity score model

# Fit the logistic propensity-score model on all columns of X (including
# the explicit Offset/intercept column), with treatment t as the outcome.
ps_data <- list(N = nrow(X), K = ncol(X), pr_sd = 10, x=X, y=t)
# NOTE(review): a single chain gives no cross-chain R-hat; consider
# chains >= 4 for production runs.
ps_fit <- sampling(ps_logis, data=ps_data, iter=5000, warmup=4000, chains=1)
## 
## SAMPLING FOR MODEL 'anon_model' NOW (CHAIN 1).
## Chain 1: 
## Chain 1: Gradient evaluation took 0.00074 seconds
## Chain 1: 1000 transitions using 10 leapfrog steps per transition would take 7.4 seconds.
## Chain 1: Adjust your expectations accordingly!
## Chain 1: 
## Chain 1: 
## Chain 1: Iteration:    1 / 5000 [  0%]  (Warmup)
## Chain 1: Iteration:  500 / 5000 [ 10%]  (Warmup)
## Chain 1: Iteration: 1000 / 5000 [ 20%]  (Warmup)
## Chain 1: Iteration: 1500 / 5000 [ 30%]  (Warmup)
## Chain 1: Iteration: 2000 / 5000 [ 40%]  (Warmup)
## Chain 1: Iteration: 2500 / 5000 [ 50%]  (Warmup)
## Chain 1: Iteration: 3000 / 5000 [ 60%]  (Warmup)
## Chain 1: Iteration: 3500 / 5000 [ 70%]  (Warmup)
## Chain 1: Iteration: 4000 / 5000 [ 80%]  (Warmup)
## Chain 1: Iteration: 4001 / 5000 [ 80%]  (Sampling)
## Chain 1: Iteration: 4500 / 5000 [ 90%]  (Sampling)
## Chain 1: Iteration: 5000 / 5000 [100%]  (Sampling)
## Chain 1: 
## Chain 1:  Elapsed Time: 37.971 seconds (Warm-up)
## Chain 1:                12.484 seconds (Sampling)
## Chain 1:                50.455 seconds (Total)
## Chain 1:
# Posterior propensity-score draws: inverse logit of the linear predictor
# mu (a draws x N matrix). plogis() is the built-in, numerically stable
# inverse logit, equivalent to 1 / (1 + exp(-x)).
ps <- plogis(extract(ps_fit)$mu)
# check mixing 
class(ps_fit)
## [1] "stanfit"
## attr(,"package")
## [1] "rstan"
post_smp <- as.data.frame(ps_fit)
# Label all K = ncol(X) beta columns. The original replaced 5 names with
# the 6 values of colnames(X), which raised a "number of items to
# replace" warning and left the Offset coefficient unlabeled.
colnames(post_smp)[seq_len(ncol(X))] <- colnames(X)
mcmc_areas(post_smp[, 1:5], pars = colnames(X)[1:5], prob = 0.8)

# Trace plots for the five covariate coefficients.
plot(post_smp$gender, type = 'l')

plot(post_smp$family_class, type = 'l')

plot(post_smp$tech_access, type = 'l')

plot(post_smp$wealth, type = 'l')

plot(post_smp$escs, type = 'l')

# predictive check on the propensity score model
y_rep <- extract(ps_fit)$y_rep
# One posterior predictive check compares the number of 0's and 1's in the observed vs predictive datasets
ppc_bars(y=t,yrep=y_rep)+ggplot2::labs(y = "Number of 0's and 1's in observed and Predictive datasets")

# Another just plots the proportions of 1's
ppc_stat(y=t,yrep=y_rep)+ggplot2::labs(x = "Proportion of 1's in observed and Predictive datasets")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

outcome models

math

# Outcome: log math score (modeled on the log scale; predictions are
# exp()'d back before computing causal contrasts).
y <- data$math_log
y_treat <- y[treat_ind]
y_control <- y[control_ind]
### bayesian linear regression model
### (kept for reference; BART below was preferred -- see the note after
### the predictive-check plots)
# math_outcome_treat <- list(N = nrow(X_treat), 
#                            N_test = nrow(X),
#                            K = ncol(X_treat), 
#                            pr_sd = 10, 
#                            x = X_treat, 
#                            y = y_treat,
#                            x_test = X)
# math_outcome_treat_regress <- sampling(outcome_normal, data=math_outcome_treat, iter=5000, warmup=4000, chains=4)
# # check mixing
# class(math_outcome_treat_regress)
# post_smp <- as.data.frame(math_outcome_treat_regress)
# colnames(post_smp)[1:5] <- colnames(X)
# 
# mcmc_areas(post_smp[,1:5], pars = colnames(X)[1:5], prob = 0.8)
# 
# plot(post_smp$gender, type='l')
# plot(post_smp$family_class, type='l')
# plot(post_smp$tech_access, type='l')
# # predictive check
# y1_pred <- extract(math_outcome_treat_regress)$y_rep
# 
# ppd_intervals(y1_pred[ ,treat_ind], x=y[treat_ind]) + geom_abline(intercept = 0, slope = 1)  +
#   ggplot2::labs(y = "Predicted y's", x = "Observed y's")
# 
# ppd_intervals(t(t(y1_pred[ ,treat_ind])-y[treat_ind]),x=y[treat_ind]) + geom_abline(intercept = 0, slope = 0)  +
#   ggplot2::labs(y = "Errors in predicted y's", x = "Observed y's")
# BART outcome model for the treated (private-school) students, predicting
# for the full sample (x.test = X). Fix: the survey weights must be subset
# to the training rows -- the original passed the full-length
# data_AUS$stu_wgt (11806 values) for 5210 training rows, misaligning
# weights with students (treated rows are scattered through data_AUS).
math_outcome_treat <- wbart(x.train=as.matrix(X_treat[1:5]), 
                            y.train = y_treat, 
                            x.test=as.matrix(X[1:5]),
                            w=data_AUS$stu_wgt[treat_ind])
# NOTE(review): the recorded output below is from the pre-fix run.
## *****Into main of wbart
## *****Data:
## data:n,p,np: 5210, 5, 11806
## y1,yn: 0.072444, -0.096642
## x1,x[n*p]: 1.000000, 0.000000
## xp1,xp[np*p]: 1.000000, 0.000000
## *****Number of Trees: 200
## *****Number of Cut Points: 1 ... 1
## *****burn and ndpost: 100, 1000
## *****Prior:beta,alpha,tau,nu,lambda: 2.000000,0.950000,0.032790,3.000000,0.005912
## *****sigma: 0.174221
## *****w (weights): 9.906390 ... 10.067360
## *****Dirichlet:sparse,theta,omega,a,b,rho,augment: 0,0,1,0.5,1,5,0
## *****nkeeptrain,nkeeptest,nkeeptestme,nkeeptreedraws: 1000,1000,1000,1000
## *****printevery: 100
## *****skiptr,skipte,skipteme,skiptreedraws: 1,1,1,1
## 
## MCMC
## done 0 (out of 1100)
## done 100 (out of 1100)
## done 200 (out of 1100)
## done 300 (out of 1100)
## done 400 (out of 1100)
## done 500 (out of 1100)
## done 600 (out of 1100)
## done 700 (out of 1100)
## done 800 (out of 1100)
## done 900 (out of 1100)
## done 1000 (out of 1100)
## time: 38s
## check counts
## trcnt,tecnt,temecnt,treedrawscnt: 1000,1000,1000,1000
# predictive check: posterior draws of predictions for ALL students
y1_pred <- math_outcome_treat$yhat.test

# Posterior predictive intervals vs observed log math scores for the
# treatment group; points should straddle the 45-degree line.
ppd_intervals(y1_pred[ ,treat_ind],x=y[treat_ind]) + geom_abline(intercept = 0, slope = 1)  +
  ggplot2::labs(y = "Predicted y's", x = "Observed y's", title="Math: Predictive check for treatment group")

# Prediction errors for the same treatment group around the zero line.
# (Title fixed: this plot shows the treatment group, not the control group.)
ppd_intervals(t(t(y1_pred[ ,treat_ind])-y[treat_ind]),x=y[treat_ind]) + geom_abline(intercept = 0, slope = 0)  +
  ggplot2::labs(y = "Errors in predicted y's", x = "Observed y's", title="Math: Predictive check for treatment group")

BART is slightly better than the Bayesian linear regression model (smaller predictive variance), so we use BART for the outcome models.

# BART outcome model for the control (public-school) students. Fix: subset
# the survey weights to the training rows -- the original passed the
# full-length data_AUS$stu_wgt for 6596 training rows, misaligning weights
# with students.
math_outcome_control <- wbart(x.train=as.matrix(X_control[1:5]), 
                              y.train = y_control, 
                              x.test=as.matrix(X[1:5]), 
                              w=data_AUS$stu_wgt[control_ind])
# NOTE(review): the recorded output below is from the pre-fix run.
## *****Into main of wbart
## *****Data:
## data:n,p,np: 6596, 5, 11806
## y1,yn: -0.034944, 0.102562
## x1,x[n*p]: 1.000000, 1.000000
## xp1,xp[np*p]: 1.000000, 0.000000
## *****Number of Trees: 200
## *****Number of Cut Points: 1 ... 1
## *****burn and ndpost: 100, 1000
## *****Prior:beta,alpha,tau,nu,lambda: 2.000000,0.950000,0.030787,3.000000,0.006994
## *****sigma: 0.189487
## *****w (weights): 9.906390 ... 20.005300
## *****Dirichlet:sparse,theta,omega,a,b,rho,augment: 0,0,1,0.5,1,5,0
## *****nkeeptrain,nkeeptest,nkeeptestme,nkeeptreedraws: 1000,1000,1000,1000
## *****printevery: 100
## *****skiptr,skipte,skipteme,skiptreedraws: 1,1,1,1
## 
## MCMC
## done 0 (out of 1100)
## done 100 (out of 1100)
## done 200 (out of 1100)
## done 300 (out of 1100)
## done 400 (out of 1100)
## done 500 (out of 1100)
## done 600 (out of 1100)
## done 700 (out of 1100)
## done 800 (out of 1100)
## done 900 (out of 1100)
## done 1000 (out of 1100)
## time: 51s
## check counts
## trcnt,tecnt,temecnt,treedrawscnt: 1000,1000,1000,1000
# predictive check: posterior draws of predictions for ALL students
y0_pred <- math_outcome_control$yhat.test

# Predictive check for the control group: fitted vs observed log scores
# (45-degree line), then prediction errors around the zero line.
ppd_intervals(y0_pred[ ,control_ind],x=y[control_ind]) + geom_abline(intercept = 0, slope = 1)  +
  ggplot2::labs(y = "Predicted y's", x = "Observed y's", title="Math: Predictive check for control group")

ppd_intervals(t(t(y0_pred[ ,control_ind])-y[control_ind]), x=y[control_ind]) + geom_abline(intercept = 0, slope = 0)  +
  ggplot2::labs(y = "Errors in predicted y's", x = "Observed y's", title = "Math: Predictive check for control group")

read

# Outcome: log reading score.
y <- data$read_log
y_treat <- y[treat_ind]
y_control <- y[control_ind]
# BART outcome model for the treated group. Fix: subset the survey weights
# to the training rows -- passing the full-length data_AUS$stu_wgt
# misaligns weights with the scattered treated rows.
read_outcome_treat <- wbart(x.train=as.matrix(X_treat[1:5]), 
                            y.train = y_treat, 
                            x.test=as.matrix(X[1:5]),
                            w=data_AUS$stu_wgt[treat_ind])
# NOTE(review): the recorded output below is from the pre-fix run.
## *****Into main of wbart
## *****Data:
## data:n,p,np: 5210, 5, 11806
## y1,yn: 0.171592, -0.356653
## x1,x[n*p]: 1.000000, 0.000000
## xp1,xp[np*p]: 1.000000, 0.000000
## *****Number of Trees: 200
## *****Number of Cut Points: 1 ... 1
## *****burn and ndpost: 100, 1000
## *****Prior:beta,alpha,tau,nu,lambda: 2.000000,0.950000,0.028634,3.000000,0.007594
## *****sigma: 0.197444
## *****w (weights): 9.906390 ... 10.067360
## *****Dirichlet:sparse,theta,omega,a,b,rho,augment: 0,0,1,0.5,1,5,0
## *****nkeeptrain,nkeeptest,nkeeptestme,nkeeptreedraws: 1000,1000,1000,1000
## *****printevery: 100
## *****skiptr,skipte,skipteme,skiptreedraws: 1,1,1,1
## 
## MCMC
## done 0 (out of 1100)
## done 100 (out of 1100)
## done 200 (out of 1100)
## done 300 (out of 1100)
## done 400 (out of 1100)
## done 500 (out of 1100)
## done 600 (out of 1100)
## done 700 (out of 1100)
## done 800 (out of 1100)
## done 900 (out of 1100)
## done 1000 (out of 1100)
## time: 44s
## check counts
## trcnt,tecnt,temecnt,treedrawscnt: 1000,1000,1000,1000
# predictive check: posterior draws of predictions for ALL students
y1_pred <- read_outcome_treat$yhat.test

# Posterior predictive intervals vs observed log reading scores for the
# treatment group.
ppd_intervals(y1_pred[ ,treat_ind],x=y[treat_ind]) + geom_abline(intercept = 0, slope = 1)  +
  ggplot2::labs(y = "Predicted y's", x = "Observed y's", title="Read: Predictive check for treatment group")

# Prediction errors for the same treatment group around the zero line.
# (Title fixed: this plot shows the treatment group, not the control group.)
ppd_intervals(t(t(y1_pred[ ,treat_ind])-y[treat_ind]),x=y[treat_ind]) + geom_abline(intercept = 0, slope = 0)  +
  ggplot2::labs(y = "Errors in predicted y's", x = "Observed y's", title="Read: Predictive check for treatment group")

# BART outcome model for the control group. Fix: subset the survey weights
# to the training rows (full-length stu_wgt misaligns with X_control).
read_outcome_control <- wbart(x.train=as.matrix(X_control[1:5]), 
                              y.train = y_control, 
                              x.test=as.matrix(X[1:5]), 
                              w=data_AUS$stu_wgt[control_ind])
# NOTE(review): the recorded output below is from the pre-fix run.
## *****Into main of wbart
## *****Data:
## data:n,p,np: 6596, 5, 11806
## y1,yn: 0.158013, -0.005358
## x1,x[n*p]: 1.000000, 1.000000
## xp1,xp[np*p]: 1.000000, 0.000000
## *****Number of Trees: 200
## *****Number of Cut Points: 1 ... 1
## *****burn and ndpost: 100, 1000
## *****Prior:beta,alpha,tau,nu,lambda: 2.000000,0.950000,0.034070,3.000000,0.009718
## *****sigma: 0.223363
## *****w (weights): 9.906390 ... 20.005300
## *****Dirichlet:sparse,theta,omega,a,b,rho,augment: 0,0,1,0.5,1,5,0
## *****nkeeptrain,nkeeptest,nkeeptestme,nkeeptreedraws: 1000,1000,1000,1000
## *****printevery: 100
## *****skiptr,skipte,skipteme,skiptreedraws: 1,1,1,1
## 
## MCMC
## done 0 (out of 1100)
## done 100 (out of 1100)
## done 200 (out of 1100)
## done 300 (out of 1100)
## done 400 (out of 1100)
## done 500 (out of 1100)
## done 600 (out of 1100)
## done 700 (out of 1100)
## done 800 (out of 1100)
## done 900 (out of 1100)
## done 1000 (out of 1100)
## time: 51s
## check counts
## trcnt,tecnt,temecnt,treedrawscnt: 1000,1000,1000,1000
# predictive check: posterior draws of predictions for ALL students
y0_pred <- read_outcome_control$yhat.test

# Predictive check for the control group: fitted vs observed log scores,
# then prediction errors around the zero line.
ppd_intervals(y0_pred[ ,control_ind],x=y[control_ind]) + geom_abline(intercept = 0, slope = 1)  +
  ggplot2::labs(y = "Predicted y's", x = "Observed y's", title="Read: Predictive check for control group")

ppd_intervals(t(t(y0_pred[ ,control_ind])-y[control_ind]), x=y[control_ind]) + geom_abline(intercept = 0, slope = 0)  +
  ggplot2::labs(y = "Errors in predicted y's", x = "Observed y's", title = "Read: Predictive check for control group")

science

# Outcome: log science score.
y <- data$science_log
y_treat <- y[treat_ind]
y_control <- y[control_ind]
# BART outcome model for the treated group. Fix: subset the survey weights
# to the training rows -- passing the full-length data_AUS$stu_wgt
# misaligns weights with the scattered treated rows.
science_outcome_treat <- wbart(x.train=as.matrix(X_treat[1:5]), 
                               y.train = y_treat,
                               x.test=as.matrix(X[1:5]),
                               w=data_AUS$stu_wgt[treat_ind])
# NOTE(review): the recorded output below is from the pre-fix run.
## *****Into main of wbart
## *****Data:
## data:n,p,np: 5210, 5, 11806
## y1,yn: 0.030477, -0.223005
## x1,x[n*p]: 1.000000, 0.000000
## xp1,xp[np*p]: 1.000000, 0.000000
## *****Number of Trees: 200
## *****Number of Cut Points: 1 ... 1
## *****burn and ndpost: 100, 1000
## *****Prior:beta,alpha,tau,nu,lambda: 2.000000,0.950000,0.029797,3.000000,0.006803
## *****sigma: 0.186884
## *****w (weights): 9.906390 ... 10.067360
## *****Dirichlet:sparse,theta,omega,a,b,rho,augment: 0,0,1,0.5,1,5,0
## *****nkeeptrain,nkeeptest,nkeeptestme,nkeeptreedraws: 1000,1000,1000,1000
## *****printevery: 100
## *****skiptr,skipte,skipteme,skiptreedraws: 1,1,1,1
## 
## MCMC
## done 0 (out of 1100)
## done 100 (out of 1100)
## done 200 (out of 1100)
## done 300 (out of 1100)
## done 400 (out of 1100)
## done 500 (out of 1100)
## done 600 (out of 1100)
## done 700 (out of 1100)
## done 800 (out of 1100)
## done 900 (out of 1100)
## done 1000 (out of 1100)
## time: 41s
## check counts
## trcnt,tecnt,temecnt,treedrawscnt: 1000,1000,1000,1000
# predictive check: posterior draws of predictions for ALL students
y1_pred <- science_outcome_treat$yhat.test

# Posterior predictive intervals vs observed log science scores for the
# treatment group. (Titles fixed: this section is the science outcome,
# not reading, and both plots show the treatment group.)
ppd_intervals(y1_pred[ ,treat_ind],x=y[treat_ind]) + geom_abline(intercept = 0, slope = 1)  +
  ggplot2::labs(y = "Predicted y's", x = "Observed y's", title="Science: Predictive check for treatment group")

ppd_intervals(t(t(y1_pred[ ,treat_ind])-y[treat_ind]),x=y[treat_ind]) + geom_abline(intercept = 0, slope = 0)  +
  ggplot2::labs(y = "Errors in predicted y's", x = "Observed y's", title="Science: Predictive check for treatment group")

# BART outcome model for the control group. Fix: subset the survey weights
# to the training rows (full-length stu_wgt misaligns with X_control).
science_outcome_control <- wbart(x.train=as.matrix(X_control[1:5]), 
                                 y.train = y_control, 
                                 x.test=as.matrix(X[1:5]), 
                                 w=data_AUS$stu_wgt[control_ind])
# NOTE(review): the recorded output below is from the pre-fix run.
## *****Into main of wbart
## *****Data:
## data:n,p,np: 6596, 5, 11806
## y1,yn: -0.079194, -0.131055
## x1,x[n*p]: 1.000000, 1.000000
## xp1,xp[np*p]: 1.000000, 0.000000
## *****Number of Trees: 200
## *****Number of Cut Points: 1 ... 1
## *****burn and ndpost: 100, 1000
## *****Prior:beta,alpha,tau,nu,lambda: 2.000000,0.950000,0.028021,3.000000,0.008300
## *****sigma: 0.206423
## *****w (weights): 9.906390 ... 20.005300
## *****Dirichlet:sparse,theta,omega,a,b,rho,augment: 0,0,1,0.5,1,5,0
## *****nkeeptrain,nkeeptest,nkeeptestme,nkeeptreedraws: 1000,1000,1000,1000
## *****printevery: 100
## *****skiptr,skipte,skipteme,skiptreedraws: 1,1,1,1
## 
## MCMC
## done 0 (out of 1100)
## done 100 (out of 1100)
## done 200 (out of 1100)
## done 300 (out of 1100)
## done 400 (out of 1100)
## done 500 (out of 1100)
## done 600 (out of 1100)
## done 700 (out of 1100)
## done 800 (out of 1100)
## done 900 (out of 1100)
## done 1000 (out of 1100)
## time: 49s
## check counts
## trcnt,tecnt,temecnt,treedrawscnt: 1000,1000,1000,1000
# predictive check: posterior draws of predictions for ALL students
y0_pred <- science_outcome_control$yhat.test

# Predictive check for the control group. (Titles fixed: this section is
# the science outcome, not reading.)
ppd_intervals(y0_pred[ ,control_ind],x=y[control_ind]) + geom_abline(intercept = 0, slope = 1)  +
  ggplot2::labs(y = "Predicted y's", x = "Observed y's", title="Science: Predictive check for control group")

ppd_intervals(t(t(y0_pred[ ,control_ind])-y[control_ind]), x=y[control_ind]) + geom_abline(intercept = 0, slope = 0)  +
  ggplot2::labs(y = "Errors in predicted y's", x = "Observed y's", title = "Science: Predictive check for control group")

calculate CATE

math

# Back-transform the log-scale BART predictions to the original score
# scale, then compute per-draw doubly-robust contrasts for every student.
y <- data$math
y0_pred <- exp(math_outcome_control$yhat.test)
y1_pred <- exp(math_outcome_treat$yhat.test)

causal_contrasts <- dr_estimate(y, t, y0_pred, y1_pred, ps)

# Subgroups: all 2^3 combinations of gender x family_class x tech_access.
group1_ind <- (X$gender == 1) & (X$family_class == 1) & (X$tech_access ==1)
group2_ind <- (X$gender == 0) & (X$family_class == 1) & (X$tech_access ==1)
group3_ind <- (X$gender == 1) & (X$family_class == 0) & (X$tech_access ==1)
group4_ind <- (X$gender == 0) & (X$family_class == 0) & (X$tech_access ==1)
group5_ind <- (X$gender == 1) & (X$family_class == 1) & (X$tech_access ==0)
group6_ind <- (X$gender == 0) & (X$family_class == 1) & (X$tech_access ==0)
group7_ind <- (X$gender == 1) & (X$family_class == 0) & (X$tech_access ==0)
group8_ind <- (X$gender == 0) & (X$family_class == 0) & (X$tech_access ==0)

# Posterior CATE draws: for each posterior draw (row), average the
# contrasts over the students (columns) belonging to each subgroup.
cate1 <- rowMeans(causal_contrasts[, group1_ind])
cate2 <- rowMeans(causal_contrasts[, group2_ind])
cate3 <- rowMeans(causal_contrasts[, group3_ind])
cate4 <- rowMeans(causal_contrasts[, group4_ind])
cate5 <- rowMeans(causal_contrasts[, group5_ind])
cate6 <- rowMeans(causal_contrasts[, group6_ind])
cate7 <- rowMeans(causal_contrasts[, group7_ind])
cate8 <- rowMeans(causal_contrasts[, group8_ind])
cate_math <- data.frame(cate1, cate2, cate3, cate4, cate5, cate6, cate7, cate8)
colnames(cate_math) <-  c("female, middle, tech", 
                     "male, middle, tech", 
                     "female, working, tech", 
                     "male, working, tech", 
                     "female, middle, no_tech", 
                     "male, middle, no_tech", 
                     "female, working, no_tech", 
                     "male, working, no_tech")
# Wide left margin so the long subgroup labels fit.
par(mar = c(4, 11, 4, 2))
boxplot(cate_math, horizontal=TRUE, las=1, main="Math: Conditional Average Treatment Effect")

#boxplot(cate, las=2)

read

# Same CATE computation as the math section, for the reading scores.
y <- data$read
y0_pred <- exp(read_outcome_control$yhat.test)
y1_pred <- exp(read_outcome_treat$yhat.test)

causal_contrasts <- dr_estimate(y, t, y0_pred, y1_pred, ps)

# Subgroups: all 2^3 combinations of gender x family_class x tech_access.
group1_ind <- (X$gender == 1) & (X$family_class == 1) & (X$tech_access ==1)
group2_ind <- (X$gender == 0) & (X$family_class == 1) & (X$tech_access ==1)
group3_ind <- (X$gender == 1) & (X$family_class == 0) & (X$tech_access ==1)
group4_ind <- (X$gender == 0) & (X$family_class == 0) & (X$tech_access ==1)
group5_ind <- (X$gender == 1) & (X$family_class == 1) & (X$tech_access ==0)
group6_ind <- (X$gender == 0) & (X$family_class == 1) & (X$tech_access ==0)
group7_ind <- (X$gender == 1) & (X$family_class == 0) & (X$tech_access ==0)
group8_ind <- (X$gender == 0) & (X$family_class == 0) & (X$tech_access ==0)

# Posterior CATE draws: per-draw subgroup means of the contrasts.
cate1 <- rowMeans(causal_contrasts[, group1_ind])
cate2 <- rowMeans(causal_contrasts[, group2_ind])
cate3 <- rowMeans(causal_contrasts[, group3_ind])
cate4 <- rowMeans(causal_contrasts[, group4_ind])
cate5 <- rowMeans(causal_contrasts[, group5_ind])
cate6 <- rowMeans(causal_contrasts[, group6_ind])
cate7 <- rowMeans(causal_contrasts[, group7_ind])
cate8 <- rowMeans(causal_contrasts[, group8_ind])
cate_read <- data.frame(cate1, cate2, cate3, cate4, cate5, cate6, cate7, cate8)
colnames(cate_read) <-  c("female, middle, tech", 
                     "male, middle, tech", 
                     "female, working, tech", 
                     "male, working, tech", 
                     "female, middle, no_tech", 
                     "male, middle, no_tech", 
                     "female, working, no_tech", 
                     "male, working, no_tech")
# Wide left margin so the long subgroup labels fit.
par(mar = c(4, 11, 4, 2))
boxplot(cate_read, horizontal=TRUE, las=1, main="Read: Conditional Average Treatment Effect")

#boxplot(cate, las=2)

science

# Same CATE computation as the math section, for the science scores.
y <- data$science
y0_pred <- exp(science_outcome_control$yhat.test)
y1_pred <- exp(science_outcome_treat$yhat.test)

causal_contrasts <- dr_estimate(y, t, y0_pred, y1_pred, ps)

# Subgroups: all 2^3 combinations of gender x family_class x tech_access.
# (These masks are also reused by the pie chart at the end of the script.)
group1_ind <- (X$gender == 1) & (X$family_class == 1) & (X$tech_access ==1)
group2_ind <- (X$gender == 0) & (X$family_class == 1) & (X$tech_access ==1)
group3_ind <- (X$gender == 1) & (X$family_class == 0) & (X$tech_access ==1)
group4_ind <- (X$gender == 0) & (X$family_class == 0) & (X$tech_access ==1)
group5_ind <- (X$gender == 1) & (X$family_class == 1) & (X$tech_access ==0)
group6_ind <- (X$gender == 0) & (X$family_class == 1) & (X$tech_access ==0)
group7_ind <- (X$gender == 1) & (X$family_class == 0) & (X$tech_access ==0)
group8_ind <- (X$gender == 0) & (X$family_class == 0) & (X$tech_access ==0)

# Posterior CATE draws: per-draw subgroup means of the contrasts.
cate1 <- rowMeans(causal_contrasts[, group1_ind])
cate2 <- rowMeans(causal_contrasts[, group2_ind])
cate3 <- rowMeans(causal_contrasts[, group3_ind])
cate4 <- rowMeans(causal_contrasts[, group4_ind])
cate5 <- rowMeans(causal_contrasts[, group5_ind])
cate6 <- rowMeans(causal_contrasts[, group6_ind])
cate7 <- rowMeans(causal_contrasts[, group7_ind])
cate8 <- rowMeans(causal_contrasts[, group8_ind])
cate_science <- data.frame(cate1, cate2, cate3, cate4, cate5, cate6, cate7, cate8)
colnames(cate_science) <-  c("female, middle, tech", 
                     "male, middle, tech", 
                     "female, working, tech", 
                     "male, working, tech", 
                     "female, middle, no_tech", 
                     "male, middle, no_tech", 
                     "female, working, no_tech", 
                     "male, working, no_tech")
# Wide left margin so the long subgroup labels fit.
par(mar = c(4, 11, 4, 2))
# Title fixed: this section plots science scores, not reading scores.
boxplot(cate_science, horizontal=TRUE, las=1, main="Science: Conditional Average Treatment Effect")

#boxplot(cate, las=2)

some visualizations

# Overlaid wealth distributions: private (blue) vs public (red) schools.
col_private <- rgb(0, 0, 1, 0.5)
col_public <- rgb(1, 0, 0, 0.5)
hist(X_treat$wealth, breaks = 30, col = col_private,
     xlim = c(-7, 7), ylim = c(0, 1600), main = "wealth", xlab = "wealth")
hist(X_control$wealth, breaks = 30, col = col_public, add = TRUE)

legend("topright", legend = c("private", "public"),
       fill = c(col_private, col_public))

# Same comparison for the ESCS socio-economic index (green vs orange).
col_private_escs <- rgb(0.2, 0.8, 0.2, 0.5)
col_public_escs <- rgb(1, 0.6, 0, 0.5)
hist(X_treat$escs, breaks = 30, col = col_private_escs,
     xlim = c(-7, 7), ylim = c(0, 700), main = "escs", xlab = "escs")
hist(X_control$escs, breaks = 30, col = col_public_escs, add = TRUE)

legend("topright", legend = c("private", "public"),
       fill = c(col_private_escs, col_public_escs))

colors <- c(rgb(0, 0, 1, 0.5), rgb(0, 1, 1, 0.5))
variables <- c("gender", "family_class", "tech_access")
group <- c("private", "public")

# Proportion of each binary covariate per school sector
# (mean of a 0/1 column equals sum/nrow); one row per sector.
Values <- rbind(
  c(mean(X_treat$gender), mean(X_treat$family_class), mean(X_treat$tech_access)),
  c(mean(X_control$gender), mean(X_control$family_class), mean(X_control$tech_access))
)

# Grouped bar chart comparing the two sectors on each covariate.
barplot(Values, main = "gender, family_class, tech_access", names.arg = variables,
                        xlab = "variables", ylab = "percentage",
                        col = colors, beside = TRUE)

# Add the legend to the chart
legend("topleft", group, cex = 0.7, fill = colors)

# Sizes of the eight gender x family_class x tech_access subgroups.
num <- vapply(list(group1_ind, group2_ind, group3_ind, group4_ind,
                   group5_ind, group6_ind, group7_ind, group8_ind),
              sum, integer(1))
label <- c("female, middle, tech", 
           "male, middle, tech", 
           "female, working, tech", 
           "male, working, tech", 
           "female, middle, no_tech", 
           "male, middle, no_tech", 
           "female, working, no_tech", 
           "male, working, no_tech")
# Shrink margins so the pie and its labels use the full device.
par(mar = c(0.5, 0.5, 0.5, 0.5))
pie(num, label, radius = 0.8)